Purpose

This document will explore the DAPC analyses for the data given different populations.

Required packages and data

library(PramCurry)
## Loading required package: poppr
## Loading required package: adegenet
## Loading required package: ade4
##    ==========================
##     adegenet 1.4-2 is loaded
##    ==========================
## 
##  - to start, type '?adegenet'
##  - to browse adegenet website, type 'adegenetWeb()'
##  - to post questions/comments: adegenet-forum@lists.r-forge.r-project.org
## 
## 
## This is poppr version 1.1.2.99.70. To get started, type package?poppr
library(reshape2)
library(ggplot2)
library(ape)
library(poppr)
library(adegenet)
library(igraph)
## 
## Attaching package: 'igraph'
## 
## The following object is masked from 'package:ape':
## 
##     edges
options(stringsAsFactors = FALSE)
data(ramdat)
data(pop_data)
data(myPal)
sessionInfo()
## R version 3.1.2 (2014-10-31)
## Platform: x86_64-apple-darwin13.4.0 (64-bit)
## 
## locale:
## [1] en_US.UTF-8/en_US.UTF-8/en_US.UTF-8/C/en_US.UTF-8/en_US.UTF-8
## 
## attached base packages:
## [1] stats     graphics  grDevices utils     datasets  methods   base     
## 
## other attached packages:
## [1] igraph_0.7.1      ape_3.1-4.5       ggplot2_1.0.0     reshape2_1.4     
## [5] PramCurry_1.0     poppr_1.1.2.99-70 adegenet_1.4-2    ade4_1.6-2       
## 
## loaded via a namespace (and not attached):
##  [1] assertthat_0.1      bitops_1.0-6        boot_1.3-11        
##  [4] caTools_1.17.1      colorspace_1.2-4    digest_0.6.4       
##  [7] dplyr_0.2           evaluate_0.5.5      fastmatch_1.0-4    
## [10] formatR_1.0         gdata_2.13.3        ggmap_2.3          
## [13] grid_3.1.2          gtable_0.1.2        gtools_3.4.1       
## [16] htmltools_0.2.6     httpuv_1.3.0        knitr_1.6          
## [19] labtools_0.4.1      lattice_0.20-29     lubridate_1.3.3    
## [22] mapproj_1.2-2       maps_2.3-9          MASS_7.3-34        
## [25] Matrix_1.1-4        memoise_0.2.1       munsell_0.4.2      
## [28] nlme_3.1-117        parallel_3.1.2      pegas_0.6          
## [31] permute_0.8-3       phangorn_1.99-7     plotrix_3.5-7      
## [34] plyr_1.8.1          png_0.1-7           proto_0.3-10       
## [37] Rcpp_0.11.2         RgoogleMaps_1.2.0.6 rjson_0.2.14       
## [40] RJSONIO_1.3-0       rmarkdown_0.3.10    scales_0.2.4       
## [43] shiny_0.10.1        stringr_0.6.2       tools_3.1.2        
## [46] vegan_2.0-10        xtable_1.7-4        yaml_2.1.13

DAPC analysis on ZONE2

# plot_posterior <- function(da.object, gid, pal){
#   posterior <- da.object$posterior
#   names(dimnames(posterior)) <- c("sample", "population")
#   to_merge <- data.frame(list(sample = dimnames(posterior)$sample, 
#                               oldPopulation = pop(gid)))
#   post <- melt(posterior, value.name = "probability")
#   post <- merge(post, to_merge)
#   if (is.numeric(post$sample)){
#     post$sample <- factor(post$sample, levels = unique(post$sample))
#   }
#   outPlot <- ggplot(post, aes(x = sample, fill = population, y = probability)) + 
#     geom_bar(stat = "identity", position = "fill", width = 1) + 
#     theme_classic() + 
#     theme(axis.text.x = element_text(angle = 90, hjust = 1)) + 
#     scale_y_continuous(expand = c(0, 0)) +
#     scale_x_discrete(expand = c(0, 0)) +
#     facet_wrap(~oldPopulation, scales = "free_x", drop = TRUE, ncol = 1) +
#     scale_fill_manual(values = pal)
#   return(outPlot)
# }

I have originally performed these by utilizing the adegenetServer() function. Since it is interactive, I will only post the code to do it here:

setpop(ramdat) <- ~ZONE2
save(ramdat, file = "zone2.rda")
adegenetServer()

This will give me the tools to look at things like the composition plot and the cross-validation plot.

Cross-validation

setpop(ramdat) <- ~ZONE2
xval <- xvalDapc(ramdat@tab, pop(ramdat), n.pca.max = 15, result = "overall", n.rep = 1000)

plot of chunk Z2xval

## NULL
xval[-1]
## $`Median and Confidence Interval for Random Chance`
##   2.5%    50%  97.5% 
## 0.1178 0.1421 0.1772 
## 
## $`Mean Successful Assignment by Number of PCs of PCA`
##      1      2      3      4      5      6      7      8      9     10 
## 0.6187 0.6910 0.7274 0.7281 0.7369 0.7620 0.7692 0.7819 0.7837 0.7795 
##     11     12     13     14     15 
## 0.7789 0.8033 0.8010 0.7958 0.7977 
## 
## $`Number of PCs Achieving Highest Mean Success`
## [1] "12"
## 
## $`Root Mean Squared Error by Number of PCs of PCA`
##      1      2      3      4      5      6      7      8      9     10 
## 0.3821 0.3107 0.2746 0.2738 0.2652 0.2397 0.2327 0.2200 0.2182 0.2224 
##     11     12     13     14     15 
## 0.2229 0.1987 0.2011 0.2062 0.2043 
## 
## $`Number of PCs Achieving Lowest MSE`
## [1] "12"
## 
## $DAPC
##  #################################################
##  # Discriminant Analysis of Principal Components #
##  #################################################
## class: dapc
## $call: dapc.data.frame(x = as.data.frame(x), grp = ..1, n.pca = ..2, 
##     n.da = ..3)
## 
## $n.pca: 12 first PCs of PCA used
## $n.da: 6 discriminant functions saved
## $var (proportion of conserved variance): 0.976
## 
## $eig (eigenvalues): 2022 110.1 57.13 49.19 17.73 ...
## 
##   vector    length content                   
## 1 $eig      6      eigenvalues               
## 2 $grp      513    prior group assignment    
## 3 $prior    7      prior group probabilities 
## 4 $assign   513    posterior group assignment
## 5 $pca.cent 31     centring vector of PCA    
## 6 $pca.norm 31     scaling vector of PCA     
## 7 $pca.eig  25     eigenvalues of PCA        
## 
##   data.frame    nrow ncol
## 1 $tab          513  12  
## 2 $means        7    12  
## 3 $loadings     12   6   
## 4 $ind.coord    513  6   
## 5 $grp.coord    7    6   
## 6 $posterior    513  7   
## 7 $pca.loadings 31   12  
## 8 $var.contr    31   6   
##   content                                          
## 1 retained PCs of PCA                              
## 2 group means                                      
## 3 loadings of variables                            
## 4 coordinates of individuals (principal components)
## 5 coordinates of groups                            
## 6 posterior membership probabilities               
## 7 PCA loadings of original variables               
## 8 contribution of original variables

Running and plotting

setpop(ramdat) <- ~ZONE2
(z2.dapc <- dapc(ramdat, n.pca = 12, n.da = 6))
##  #################################################
##  # Discriminant Analysis of Principal Components #
##  #################################################
## class: dapc
## $call: dapc.genind(x = ramdat, n.pca = 12, n.da = 6)
## 
## $n.pca: 12 first PCs of PCA used
## $n.da: 6 discriminant functions saved
## $var (proportion of conserved variance): 0.976
## 
## $eig (eigenvalues): 2022 110.1 57.13 49.19 17.73 ...
## 
##   vector    length content                   
## 1 $eig      6      eigenvalues               
## 2 $grp      513    prior group assignment    
## 3 $prior    7      prior group probabilities 
## 4 $assign   513    posterior group assignment
## 5 $pca.cent 31     centring vector of PCA    
## 6 $pca.norm 31     scaling vector of PCA     
## 7 $pca.eig  25     eigenvalues of PCA        
## 
##   data.frame    nrow ncol
## 1 $tab          513  12  
## 2 $means        7    12  
## 3 $loadings     12   6   
## 4 $ind.coord    513  6   
## 5 $grp.coord    7    6   
## 6 $posterior    513  7   
## 7 $pca.loadings 31   12  
## 8 $var.contr    31   6   
##   content                                          
## 1 retained PCs of PCA                              
## 2 group means                                      
## 3 loadings of variables                            
## 4 coordinates of individuals (principal components)
## 5 coordinates of groups                            
## 6 posterior membership probabilities               
## 7 PCA loadings of original variables               
## 8 contribution of original variables
loadingplot(z2.dapc$var.contr, axis = 1)

plot of chunk unnamed-chunk-5

loadingplot(z2.dapc$var.contr, axis = 2)

plot of chunk unnamed-chunk-5

png("loading_figure.png", width = 183, height = 88, units = "mm", res = 300)
loadingplot(z2.dapc$var.contr, axis = 1)
dev.off()
## pdf 
##   2
popPal <- char2pal(ramdat@pop.names)
scatter(z2.dapc, cex = 2, legend = TRUE, clabel = FALSE, col = popPal,
        posi.leg = "bottomleft", scree.pca = TRUE, posi.pca = "topright",
        posi.da = "bottomright", cleg = 0.75, xax = 1, yax = 2, inset.solid = 1, 
        ratio.pca = 0.2, ratio.da = 0.2)

plot of chunk unnamed-chunk-5

png("no_nursery_DAPC.png", width = 183, height = 183, units = "mm", res = 300)
scatter(z2.dapc, cex = 2, legend = TRUE, clabel = FALSE, col = popPal,
        posi.leg = "bottomright", scree.pca = TRUE, posi.pca = "topright",
        posi.da = "bottomleft", cleg = 0.75, xax = 1, yax = 2, inset.solid = 1, 
        ratio.pca = 0.2, ratio.da = 0.2)
dev.off()
## pdf 
##   2
plot_posterior(z2.dapc, ramdat, popPal)

plot of chunk unnamed-chunk-6

Removing biases

Here, I am removing the Cape Sebastian isolates as well as the populations with less than 10 isolates.

noseb <- popsub(ramdat, blacklist = "HunterCr", drop = FALSE)
noseb.gt.10 <- selPopSize(noseb, n = 10)
# save(noseb.gt.10, file = "nosebpr.rda")
noseb.xval <- xvalDapc(noseb.gt.10@tab, pop(noseb.gt.10), n.pca.max = 15, 
                       result = "overall", n.rep = 1000)

plot of chunk nosebXval

## NULL
noseb.xval[-1]
## $`Median and Confidence Interval for Random Chance`
##   2.5%    50%  97.5% 
## 0.1633 0.2004 0.2350 
## 
## $`Mean Successful Assignment by Number of PCs of PCA`
##      1      2      3      4      5      6      7      8      9     10 
## 0.6048 0.6281 0.6700 0.6952 0.7389 0.7643 0.7667 0.7690 0.7594 0.7642 
##     11     12     13     14     15 
## 0.7829 0.7826 0.7808 0.7842 0.7855 
## 
## $`Number of PCs Achieving Highest Mean Success`
## [1] "15"
## 
## $`Root Mean Squared Error by Number of PCs of PCA`
##      1      2      3      4      5      6      7      8      9     10 
## 0.3965 0.3760 0.3346 0.3115 0.2666 0.2420 0.2396 0.2377 0.2469 0.2421 
##     11     12     13     14     15 
## 0.2237 0.2237 0.2261 0.2225 0.2214 
## 
## $`Number of PCs Achieving Lowest MSE`
## [1] "15"
## 
## $DAPC
##  #################################################
##  # Discriminant Analysis of Principal Components #
##  #################################################
## class: dapc
## $call: dapc.data.frame(x = as.data.frame(x), grp = ..1, n.pca = ..2, 
##     n.da = ..3)
## 
## $n.pca: 15 first PCs of PCA used
## $n.da: 4 discriminant functions saved
## $var (proportion of conserved variance): 0.987
## 
## $eig (eigenvalues): 185.4 99.69 66.05 29.63  vector    length content                   
## 1 $eig      4      eigenvalues               
## 2 $grp      443    prior group assignment    
## 3 $prior    5      prior group probabilities 
## 4 $assign   443    posterior group assignment
## 5 $pca.cent 31     centring vector of PCA    
## 6 $pca.norm 31     scaling vector of PCA     
## 7 $pca.eig  24     eigenvalues of PCA        
## 
##   data.frame    nrow ncol
## 1 $tab          443  15  
## 2 $means        5    15  
## 3 $loadings     15   4   
## 4 $ind.coord    443  4   
## 5 $grp.coord    5    4   
## 6 $posterior    443  5   
## 7 $pca.loadings 31   15  
## 8 $var.contr    31   4   
##   content                                          
## 1 retained PCs of PCA                              
## 2 group means                                      
## 3 loadings of variables                            
## 4 coordinates of individuals (principal components)
## 5 coordinates of groups                            
## 6 posterior membership probabilities               
## 7 PCA loadings of original variables               
## 8 contribution of original variables
(noseb.dapc <- dapc(noseb.gt.10, n.pca = 15, n.da = 4))
##  #################################################
##  # Discriminant Analysis of Principal Components #
##  #################################################
## class: dapc
## $call: dapc.genind(x = noseb.gt.10, n.pca = 15, n.da = 4)
## 
## $n.pca: 15 first PCs of PCA used
## $n.da: 4 discriminant functions saved
## $var (proportion of conserved variance): 0.987
## 
## $eig (eigenvalues): 185.4 99.69 66.05 29.63  vector    length content                   
## 1 $eig      4      eigenvalues               
## 2 $grp      443    prior group assignment    
## 3 $prior    5      prior group probabilities 
## 4 $assign   443    posterior group assignment
## 5 $pca.cent 31     centring vector of PCA    
## 6 $pca.norm 31     scaling vector of PCA     
## 7 $pca.eig  24     eigenvalues of PCA        
## 
##   data.frame    nrow ncol
## 1 $tab          443  15  
## 2 $means        5    15  
## 3 $loadings     15   4   
## 4 $ind.coord    443  4   
## 5 $grp.coord    5    4   
## 6 $posterior    443  5   
## 7 $pca.loadings 31   15  
## 8 $var.contr    31   4   
##   content                                          
## 1 retained PCs of PCA                              
## 2 group means                                      
## 3 loadings of variables                            
## 4 coordinates of individuals (principal components)
## 5 coordinates of groups                            
## 6 posterior membership probabilities               
## 7 PCA loadings of original variables               
## 8 contribution of original variables
scatter(noseb.dapc, cex = 2, legend = TRUE, clabel = FALSE, 
        col = popPal[noseb.gt.10@pop.names], scree.pca = TRUE,
        posi.leg = "bottomleft", posi.pca = "topleft",
        cleg = 0.75)

plot of chunk unnamed-chunk-8

plot_posterior(noseb.dapc, noseb.gt.10, popPal)

plot of chunk unnamed-chunk-9

Predicting sources.

Since we now have a model from our well represented populations, we can use this to predict the sources of individuals from these populations. Let’s see if they match up

to_test <- noseb@pop.names[!noseb@pop.names %in% noseb.gt.10@pop.names]
testpop <- popsub(noseb, to_test, drop = FALSE)
predictions <- predict.dapc(noseb.dapc, newdata = testpop)
plot_posterior(predictions, testpop, popPal)

plot of chunk unnamed-chunk-10

to_test <- ramdat@pop.names[!ramdat@pop.names %in% noseb.gt.10@pop.names]
testpop <- popsub(ramdat, to_test, drop = FALSE)
predictions <- predict.dapc(noseb.dapc, newdata = testpop)
plot_posterior(predictions, testpop, popPal)

plot of chunk unnamed-chunk-11

Using 95% inertia ellipses.

scatter(noseb.dapc, cex = 2, legend = TRUE, clabel = FALSE, cellipse = 2.5,
        col = popPal[noseb.gt.10@pop.names], scree.pca = FALSE,
        posi.leg = "bottomleft", posi.pca = "topleft",
        cleg = 0.75, bg="white", scree.da=0, pch=20, 
        xlim = c(-3, 7), ylim = c(-3, 3.5), solid = 0.5)
par(xpd=TRUE)
# points(noseb.dapc$ind.coord[, 1], noseb.dapc$ind.coord[, 2], pch=20,
#        col = transp(popPal[noseb.gt.10@pop.names]), cex = 2)
points(predictions$ind.scores[, 1], predictions$ind.scores[, 2], 
       pch = 20, col = transp("black", 0.2), cex = 3)
points(predictions$ind.scores[, 1], predictions$ind.scores[, 2], 
       pch = as.character(pop(testpop)),
       col = transp(popPal[as.character(pop(testpop))], 0.7), cex = 1)
add.scatter.eig(noseb.dapc$eig, 15, 1, 2, posi="bottomright", inset=.02)

plot of chunk unnamed-chunk-12

sebpredmat <- matrix(c(
  rowMeans(apply(predictions$posterior[1:66, ], 1, function(x) x >= 0.5)),
  rowMeans(apply(predictions$posterior[1:66, ], 1, function(x) x >= 0.95)),
  rowMeans(apply(predictions$posterior[1:66, ], 1, function(x) x >= 0.99)),
  rowMeans(apply(predictions$posterior[1:66, ], 1, function(x) x >= 0.999))),
  ncol = 4)
dimnames(sebpredmat) <- list(Population = colnames(predictions$posterior),
                             `membership probability` = c("50%", "95%", "99%", 
                                                          "99.9%"))
signif(sebpredmat, 3)
##             membership probability
## Population   50% 95%   99% 99.9%
##   JHallCr      0   0 0.000 0.000
##   NFChetHigh   0   0 0.000 0.000
##   Coast        1   1 0.985 0.909
##   Winchuck     0   0 0.000 0.000
##   ChetcoMain   0   0 0.000 0.000